set.seed(12991292);
options(scipen = 50);

red.1 <- "#f30f13"; # bright
red.2 <- "#d90d11"; # darker
red.3 <- "#bb0c0f"; # even darker
red.4 <- "#900b0d"; # darkest

# NB: A "plots" directory must exist for the (PDF) plots.
#
if (exists(as.character(substitute(reglinux))) == 0) {

    diff.time <- function(x, y, fmt = NULL) {

        if (is.null(fmt) != F)
            fmt = "%d-%m-%Y"

        delta <- as.Date(x, format = fmt) -
            as.Date(y, format = fmt);

        return(as.vector(delta));
    }

    reglinux = read.csv("reglinux.csv");
    reglinux$diff <- diff.time(reglinux$endDate,
                               reglinux$startDate,
                               fmt = "%d-%m-%Y");

    print(paste("n =", nrow(reglinux)));
    stopifnot(sum(length(which(reglinux$diff < 0))) == 0);

    tmp <- as.character(reglinux$startDate);
    reglinux$year <- rep(NA, nrow(reglinux));

    for (i in 1:length(tmp)) {
        reglinux$year[i] <- strsplit(as.character(tmp[i]), "-")[[1]][3];
    }

    tmp <- reglinux$subsystem;
    reglinux$subsystem.recoded <- as.character(reglinux$subsystem);
    reglinux$subsystem.recoded[which(tmp == "sound")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "security")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "net")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "mm")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "kernel")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "fs")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "crypto")] <- "others";
    reglinux$subsystem.recoded[which(tmp == "block")] <- "others";
    reglinux$subsystem.recoded <- as.factor(reglinux$subsystem.recoded);
    reglinux$subsystem.recoded <- relevel(reglinux$subsystem.recoded, "drivers");

    reglinux$year <- as.factor(reglinux$year);
    reglinux$subsystem <- as.factor(reglinux$subsystem);
    reglinux$subsystem <- relevel(reglinux$subsystem, "drivers");
    reglinux$revtest <- as.factor(reglinux$revtest);
    reglinux$acked <- as.factor(reglinux$acked);
    reglinux$suggested <- as.factor(reglinux$suggested);
    reglinux$year <- as.factor(reglinux$year);

    reglinux$bindiff <- rep(1, nrow(reglinux));
    reglinux$bindiff[which(reglinux$diff < median(reglinux$diff))] <- 0;
    reglinux$bindiff <- as.factor(reglinux$bindiff);

    rm(tmp);
}

plot.years <- function(eps) {

    if (eps != 0) {
        graphics.off();
        plot.pdf("plots/fig_years.pdf", 3.5, 8, 16);
    }

    par(mar = c(4, 5, 1, 1), font.main = 1);

    x <- as.character(reglinux$startDate);
    years <- seq(2021, 2024);
    v <- vector(length = length(years));
    names(v) <- years;

    for (i in 1:length(x)) {
        year <- strsplit(as.character(x[i]), "-")[[1]][3];
        idx <- which(names(v) == year);
        v[idx] <- v[idx] + 1;
    }

    kol <- c(red.1, red.2, red.3, red.3);
    barplot(v, col = kol, yaxt = "n", ylim = c(0, 400));

    box();
    axis(2, las = 2);
    mtext("Year", side = 1, line = 2.5);
    mtext("Frequency", side = 2, line = 3.5);

    legend("topleft", c(paste("n =", nrow(reglinux)),
                        paste("Mean across years =", mean(v))), bty = "n");

    if (eps != 0)
        graphics.off();
}

plot.difftime <- function(eps) {

    if (eps != 0) {
        graphics.off();
        plot.pdf("plots/fig_diff_time.pdf", 4.5, 8, 18);
    }

    par(mfrow = c(1, 2), mar = c(5, 5, 0, 0), font.main = 1);

    hist(reglinux$diff, ylim = c(0, 900),
         col = red.1, yaxt = "n",
         main = "", xlab = "");
    axis(2, las = 2);
    mtext("Bug fixing times (days)", side = 1, line = 3);

    legend("topright",
           c(paste("Mean =", round(mean(reglinux$diff), 0), "days"),
             paste("Median =", round(median(reglinux$diff), 0), "days"),
             paste("Std. dev. =", round(sd(reglinux$diff), 0), "days")),
           bty = "n", cex = 0.9);

    hist(log(reglinux$diff), ylim = c(0, 200),
         col = red.3, yaxt = "n",
         main = "", xlab = "");
    axis(2, las = 2);
    abline(v = median(log(reglinux$diff)), lwd = 2);
    mtext("A logarithm of bug\nfixing times (days)", side = 1, line = 3.5);

    arrows(3.5, 190, 2.7, 190, length = 0.07);
    text(4.7, 190, "Median", cex = 0.9);

    if (eps != 0)
        graphics.off();
}

plot.subsystem <- function(eps) {

    if (eps != 0) {
        graphics.off();
        plot.pdf("plots/fig_subsystem.pdf", 4, 8, 16);
    }

    par(mfrow = c(1, 1), mar = c(5, 5, 1, 0), font.main = 1);

    tab <- sort(table(reglinux$subsystem), decreasing = T);

    barplot(tab,
            las = 2, ylim = c(0, 600), col = red.2);
    mtext("Frequency", side = 2, line = 3.5);

    legend("topright",
           c(paste("Mean across subsystems =", round(mean(tab), 0)),
             paste("Median across subsystems =", round(median(tab), 0))),
           bty = "n", cex = 0.95);

    abline(h = mean(tab));

    barplot(tab, add = T,
            las = 2, ylim = c(0, 600), col = red.2);

    arrows(10, 150, 10, 95, length = 0.07);
    text(10, 185, "Mean", cex = 0.95);

    if (eps != 0)
        graphics.off();
}

plot.subsystem.diff <- function(eps) {

    if (eps != 0) {
        graphics.off();
        plot.pdf("plots/fig_subsystem_diff.pdf", 4, 8, 16);
    }

    par(mfrow = c(1, 1), mar = c(5, 5, 1, 0), font.main = 1);

    boxplot(reglinux$diff ~ reglinux$subsystem,
            las = 2, col = red.1, xlab = "", ylab = "");

    mtext("Bug fxing times (days)", side = 2, line = 3.5);

    kw <- kruskal.test(reglinux$diff ~ reglinux$subsystem);
    print(kw);

    if (eps != 0)
        graphics.off();
}

plot.testing <- function(eps) {

    if (eps != 0) {
        graphics.off();
        plot.pdf("plots/fig_revtest_subsystem.pdf", 4, 9.7, 16);
    }

    par(mfrow = c(1, 1), mar = c(7, 3, 1, 0),
        font.main = 1, las = 2, cex.lab = 1);

    plot(reglinux$revtest ~ reglinux$subsystem,
         ylab = "", yaxt = "n", xlab = "", col = c(red.1, red.4));

    par(new = F, las = 1);
    mtext("REVTEST", side = 2, line = 2, las = 3);
    mtext("SUBSYSTEM", side = 1, line = 5);

    if (eps != 0)
        graphics.off();
}

do.reg <- function() {

    require(fmsb);
    require(MASS);
    require(bestglm);

    f.1 <- formula(diff ~ reporters);
    f.2 <- formula(diff ~ reporters + bugtracker);
    f.3 <- formula(diff ~ reporters + bugtracker + references);
    f.4 <- formula(diff ~ reporters + bugtracker + references + entropy);
    f.5 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest);
    f.6 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                   + acked);
    f.7 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                   + acked + suggested);
    f.8 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                   + acked + suggested + modfile);
    f.9 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                   + acked + suggested + modfile + locadd);
    f.10 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                   + acked + suggested + modfile + locadd + locdel);
    f.11 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                   + acked + suggested + modfile + locadd + locdel + subsystem);
    f.12 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                    + acked + suggested + modfile + locadd + locdel + subsystem + year);

    # The one based on model selection and an interaction model.
    #
    f.x.1 <- formula(diff ~ references + entropy + locadd + subsystem.recoded);
    f.x.2 <- formula(diff ~ reporters + bugtracker + references + entropy + revtest
                     + acked + suggested + modfile + locadd + locdel + subsystem
                     + year + subsystem:revtest);

    do.glm <- function(f, pr = F, best = F, lab = NULL) {

        r <- glm.nb(f, data = reglinux,
                    control = glm.control(maxit = 1000, trace = F));

        m <- matrix(nrow = 1, ncol = 3);
        m[1, 1] <- length(coefficients(r));
        m[1, 2] <- round(as.numeric(NagelkerkeR2(r)[2]), 2);
        m[1, 3] <- round(BIC(r), 0);

        if (pr == T) {
            print(paste(lab, "MODEL:"));
            print(summary(r));
            print(paste("BIC =", m[1, 3]));
            print(paste("Nagelkerke R^2 =", m[1, 2]));
        }

        # "The argument Xy is usually a data-frame containing in the first
        #  p columns the design matrix and in the last column the response."
        #
        if (best == T) {

            x <- reglinux[, c("reporters",
                              "bugtracker",
                              "references",
                              "entropy",
                              "revtest",
                              "acked",
                              "suggested",
                              "modfile",
                              "locadd",
                              "locdel",
                              "subsystem.recoded",
                              "year",
                              "diff")];

            # The required dispersion parameter is taken
            # from the glm.nb() fit for the full model.
            #
            rbest <- bestglm(x, IC = "BIC",
                             family = negative.binomial(theta = 0.9331));

            print("MODEL SELECTION:");
            print(summary(rbest$BestModel));
        }

        return (m);
    }

    m <- matrix(nrow = 12, ncol = 3);
    colnames(m) <- c("k", "R^2", "BIC");

    m[1, ] <- do.glm(f.1, F, F);
    m[2, ] <- do.glm(f.2, F, F);
    m[3, ] <- do.glm(f.3, F, F);
    m[4, ] <- do.glm(f.4, F, F);
    m[5, ] <- do.glm(f.5, F, F);
    m[6, ] <- do.glm(f.6, F, F);
    m[7, ] <- do.glm(f.7, F, F);
    m[8, ] <- do.glm(f.8, F, F);
    m[9, ] <- do.glm(f.9, F, F);
    m[10, ] <- do.glm(f.10, F, F);
    m[11, ] <- do.glm(f.11, F, F);
    m[12, ] <- do.glm(f.12, T, T, "FULL");

    do.glm(f.x.1, T, F, "PARSIMONIOUS");
    do.glm(f.x.2, T, F, "INTERACTION");

    print(m);
}

do.classification <- function() {

    require(caret);
    require(kernlab);
    require(naivebayes);
    require(randomForest);

    ctrl <- trainControl(method = "cv", number = 5,
                         verboseIter = F, allowParallel = T);

    x <- reglinux[, c("reporters",
                      "bugtracker",
                      "references",
                      "entropy",
                      "revtest",
                      "acked",
                      "suggested",
                      "modfile",
                      "locadd",
                      "locdel",
                      "subsystem",
                      "year",
                      "bindiff")];

    test.idx <- sample.int(floor(nrow(x) * 0.1));
    x.test <- x[test.idx, ];
    x.train <- x[-test.idx, ];

    # Naive bayes.
    #
    r.t.nb <- train(bindiff ~ ., data = x.train, method = "naive_bayes",
                    metric = "Accuracy", trControl = ctrl);
    r.p.nb <- predict(r.t.nb, newdata = x.test);
    r.c.nb <- confusionMatrix(r.p.nb, x.test$bindiff);

    # Random forest.
    #
    r.t.rf <- train(bindiff ~ ., data = x.train, method = "rf",
                    metric = "Accuracy", trControl = ctrl);
    r.p.rf <- predict(r.t.rf, newdata = x.test);
    r.c.rf <- confusionMatrix(r.p.rf, x.test$bindiff);

    # Support vector machine.
    #
    r.t.svm <- train(bindiff ~ ., data = x.train, method = "svmLinear",
                     metric = "Accuracy", trControl = ctrl);
    r.p.svm <- predict(r.t.svm, newdata = x.test);
    r.c.svm <- confusionMatrix(r.p.svm, x.test$bindiff);

    m <- matrix(nrow = 3, ncol = 4);
    colnames(m) <- c("Precision", "Recall", "Accuracy", "F1");
    rownames(m) <- c("Naive bayes", "Random forest", "SVM");

    m[1, 1] <- r.c.nb$byClass[5];
    m[1, 2] <- r.c.nb$byClass[6];
    m[1, 4] <- r.c.nb$byClass[11];
    m[1, 3] <- r.c.nb$byClass[7];

    m[2, 1] <- r.c.rf$byClass[5];
    m[2, 2] <- r.c.rf$byClass[6];
    m[2, 4] <- r.c.rf$byClass[11];
    m[2, 3] <- r.c.rf$byClass[7];

    m[3, 1] <- r.c.svm$byClass[5];
    m[3, 2] <- r.c.svm$byClass[6];
    m[3, 4] <- r.c.svm$byClass[11];
    m[3, 3] <- r.c.svm$byClass[7];

    print(m);
}

eps <- 1;
plot.years(eps);
plot.difftime(eps);
plot.subsystem(eps);
plot.subsystem.diff(eps);
plot.testing(eps);
do.reg();
do.classification();
